In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
In [2]:
conn = sqlite3.connect("olist.sqlite")
cur= conn.cursor()
In [3]:
query = "SELECT name FROM sqlite_master WHERE type='table'"
cur.execute(query)

cur.fetchall()
Out[3]:
[('product_category_name_translation',),
 ('sellers',),
 ('customers',),
 ('geolocation',),
 ('order_items',),
 ('order_payments',),
 ('order_reviews',),
 ('orders',),
 ('products',),
 ('leads_qualified',),
 ('leads_closed',)]
In [4]:
query = "PRAGMA table_info(order_items)"
pd.read_sql(query, conn)
Out[4]:
cid name type notnull dflt_value pk
0 0 order_id TEXT 0 None 0
1 1 order_item_id INTEGER 0 None 0
2 2 product_id TEXT 0 None 0
3 3 seller_id TEXT 0 None 0
4 4 shipping_limit_date TEXT 0 None 0
5 5 price REAL 0 None 0
6 6 freight_value REAL 0 None 0
In [5]:
query = "SELECT COUNT(*) FROM order_items;"
pd.read_sql(query, conn)
Out[5]:
COUNT(*)
0 112650
In [6]:
query = "SELECT * FROM order_items LIMIT 10;"
pd.read_sql(query, conn)
Out[6]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 2017-09-19 09:45:35 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 2017-05-03 11:05:13 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 2018-01-18 14:48:30 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 2018-08-15 10:10:18 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 2017-02-13 13:57:51 199.90 18.14
5 00048cc3ae777c65dbb7d2a0634bc1ea 1 ef92defde845ab8450f9d70c526ef70f 6426d21aca402a131fc0a5d0960a3c90 2017-05-23 03:55:27 21.90 12.69
6 00054e8431b9d7675808bcb819fb4a32 1 8d4f2bb7e93e6710a28f34fa83ee7d28 7040e82f899a04d1b434b795a43b4617 2017-12-14 12:10:31 19.90 11.85
7 000576fe39319847cbb9d288c5617fa6 1 557d850972a7d6f792fd18ae1400d9b6 5996cddab893a4652a15592fb58ab8db 2018-07-10 12:30:45 810.00 70.75
8 0005a1a1728c9d785b8e2b08b904576c 1 310ae3c140ff94b03219ad0adc3c778f a416b6a846a11724393025641d4edd5e 2018-03-26 18:31:29 145.95 11.65
9 0005f50442cb953dcd1d21e1fb923495 1 4535b0e1091c278dfd193e5a1d63b39f ba143b05f0110f0dc71ad71b4466ce92 2018-07-06 14:10:56 53.99 11.40
In [7]:
query = "PRAGMA table_info(product_category_name_translation)"
pd.read_sql(query, conn)
Out[7]:
cid name type notnull dflt_value pk
0 0 product_category_name TEXT 0 None 0
1 1 product_category_name_english TEXT 0 None 0
In [8]:
query = "SELECT * FROM product_category_name_translation LIMIT 5;"
pd.read_sql(query, conn)
Out[8]:
product_category_name product_category_name_english
0 beleza_saude health_beauty
1 informatica_acessorios computers_accessories
2 automotivo auto
3 cama_mesa_banho bed_bath_table
4 moveis_decoracao furniture_decor
In [9]:
query = "PRAGMA table_info(sellers)"
pd.read_sql(query, conn)
Out[9]:
cid name type notnull dflt_value pk
0 0 seller_id TEXT 0 None 0
1 1 seller_zip_code_prefix INTEGER 0 None 0
2 2 seller_city TEXT 0 None 0
3 3 seller_state TEXT 0 None 0
In [10]:
query = "SELECT * FROM sellers LIMIT 5;"
pd.read_sql(query, conn)
Out[10]:
seller_id seller_zip_code_prefix seller_city seller_state
0 3442f8959a84dea7ee197c632cb2df15 13023 campinas SP
1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 mogi guacu SP
2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 rio de janeiro RJ
3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 sao paulo SP
4 51a04a8a6bdcb23deccc82b0b80742cf 12914 braganca paulista SP
In [11]:
query = "PRAGMA table_info(customers)"
pd.read_sql(query, conn)
Out[11]:
cid name type notnull dflt_value pk
0 0 customer_id TEXT 0 None 0
1 1 customer_unique_id TEXT 0 None 0
2 2 customer_zip_code_prefix INTEGER 0 None 0
3 3 customer_city TEXT 0 None 0
4 4 customer_state TEXT 0 None 0
In [12]:
query = "SELECT * FROM customers LIMIT 5;"
pd.read_sql(query, conn)
Out[12]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP
In [13]:
query = "PRAGMA table_info(geolocation)"
pd.read_sql(query, conn)
Out[13]:
cid name type notnull dflt_value pk
0 0 geolocation_zip_code_prefix INTEGER 0 None 0
1 1 geolocation_lat REAL 0 None 0
2 2 geolocation_lng REAL 0 None 0
3 3 geolocation_city TEXT 0 None 0
4 4 geolocation_state TEXT 0 None 0
In [14]:
query = "SELECT * FROM geolocation LIMIT 5;"
pd.read_sql(query, conn)
Out[14]:
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
0 1037 -23.545621 -46.639292 sao paulo SP
1 1046 -23.546081 -46.644820 sao paulo SP
2 1046 -23.546129 -46.642951 sao paulo SP
3 1041 -23.544392 -46.639499 sao paulo SP
4 1035 -23.541578 -46.641607 sao paulo SP
In [15]:
query = "PRAGMA table_info(order_payments)"
pd.read_sql(query, conn)
Out[15]:
cid name type notnull dflt_value pk
0 0 order_id TEXT 0 None 0
1 1 payment_sequential INTEGER 0 None 0
2 2 payment_type TEXT 0 None 0
3 3 payment_installments INTEGER 0 None 0
4 4 payment_value REAL 0 None 0
In [16]:
query = "SELECT * FROM order_payments LIMIT 10;"
pd.read_sql(query, conn)
Out[16]:
order_id payment_sequential payment_type payment_installments payment_value
0 b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33
1 a9810da82917af2d9aefd1278f1dcfa0 1 credit_card 1 24.39
2 25e8ea4e93396b6fa0d3dd708e76c1bd 1 credit_card 1 65.71
3 ba78997921bbcdc1373bb41e913ab953 1 credit_card 8 107.78
4 42fdf880ba16b47b59251dd489d4441a 1 credit_card 2 128.45
5 298fcdf1f73eb413e4d26d01b25bc1cd 1 credit_card 2 96.12
6 771ee386b001f06208a7419e4fc1bbd7 1 credit_card 1 81.16
7 3d7239c394a212faae122962df514ac7 1 credit_card 3 51.84
8 1f78449c87a54faf9e96e88ba1491fa9 1 credit_card 6 341.09
9 0573b5e23cbd798006520e1d5b4c6714 1 boleto 1 51.95
In [17]:
query = "PRAGMA table_info(order_reviews)"
pd.read_sql(query, conn)
Out[17]:
cid name type notnull dflt_value pk
0 0 review_id TEXT 0 None 0
1 1 order_id TEXT 0 None 0
2 2 review_score INTEGER 0 None 0
3 3 review_comment_title TEXT 0 None 0
4 4 review_comment_message TEXT 0 None 0
5 5 review_creation_date TEXT 0 None 0
6 6 review_answer_timestamp TEXT 0 None 0
In [18]:
query = "SELECT * FROM order_reviews LIMIT 5;"
pd.read_sql(query, conn)
Out[18]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 None None 2018-01-18 00:00:00 2018-01-18 21:46:59
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 None None 2018-03-10 00:00:00 2018-03-11 03:05:13
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 None None 2018-02-17 00:00:00 2018-02-18 14:36:24
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 None Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 None Parabéns lojas lannister adorei comprar pela I... 2018-03-01 00:00:00 2018-03-02 10:26:53
In [19]:
query = "PRAGMA table_info(orders)"
pd.read_sql(query, conn)
Out[19]:
cid name type notnull dflt_value pk
0 0 order_id TEXT 0 None 0
1 1 customer_id TEXT 0 None 0
2 2 order_status TEXT 0 None 0
3 3 order_purchase_timestamp TEXT 0 None 0
4 4 order_approved_at TEXT 0 None 0
5 5 order_delivered_carrier_date TEXT 0 None 0
6 6 order_delivered_customer_date TEXT 0 None 0
7 7 order_estimated_delivery_date TEXT 0 None 0
In [20]:
query = "SELECT * FROM orders LIMIT 5;"
pd.read_sql(query, conn)
Out[20]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
In [21]:
query = "PRAGMA table_info(products)"
pd.read_sql(query, conn)
Out[21]:
cid name type notnull dflt_value pk
0 0 product_id TEXT 0 None 0
1 1 product_category_name TEXT 0 None 0
2 2 product_name_lenght REAL 0 None 0
3 3 product_description_lenght REAL 0 None 0
4 4 product_photos_qty REAL 0 None 0
5 5 product_weight_g REAL 0 None 0
6 6 product_length_cm REAL 0 None 0
7 7 product_height_cm REAL 0 None 0
8 8 product_width_cm REAL 0 None 0
In [22]:
query = "SELECT * FROM products LIMIT 5;"
pd.read_sql(query, conn)
Out[22]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40.0 287.0 1.0 225.0 16.0 10.0 14.0
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44.0 276.0 1.0 1000.0 30.0 18.0 20.0
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46.0 250.0 1.0 154.0 18.0 9.0 15.0
3 cef67bcfe19066a932b7673e239eb23d bebes 27.0 261.0 1.0 371.0 26.0 4.0 26.0
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37.0 402.0 4.0 625.0 20.0 17.0 13.0
In [23]:
query = "PRAGMA table_info(leads_qualified)"
pd.read_sql(query, conn)
Out[23]:
cid name type notnull dflt_value pk
0 0 mql_id TEXT 0 None 0
1 1 first_contact_date TEXT 0 None 0
2 2 landing_page_id TEXT 0 None 0
3 3 origin TEXT 0 None 0
In [24]:
query = "SELECT * FROM leads_qualified LIMIT 5;"
pd.read_sql(query, conn)
Out[24]:
mql_id first_contact_date landing_page_id origin
0 dac32acd4db4c29c230538b72f8dd87d 2018-02-01 88740e65d5d6b056e0cda098e1ea6313 social
1 8c18d1de7f67e60dbd64e3c07d7e9d5d 2017-10-20 007f9098284a86ee80ddeb25d53e0af8 paid_search
2 b4bc852d233dfefc5131f593b538befa 2018-03-22 a7982125ff7aa3b2054c6e44f9d28522 organic_search
3 6be030b81c75970747525b843c1ef4f8 2018-01-22 d45d558f0daeecf3cccdffe3c59684aa email
4 5420aad7fec3549a85876ba1c529bd84 2018-02-21 b48ec5f3b04e9068441002a19df93c6c organic_search
In [ ]:
 
In [25]:
query = "PRAGMA table_info(leads_closed)"
pd.read_sql(query, conn)
Out[25]:
cid name type notnull dflt_value pk
0 0 mql_id TEXT 0 None 0
1 1 seller_id TEXT 0 None 0
2 2 sdr_id TEXT 0 None 0
3 3 sr_id TEXT 0 None 0
4 4 won_date TEXT 0 None 0
5 5 business_segment TEXT 0 None 0
6 6 lead_type TEXT 0 None 0
7 7 lead_behaviour_profile TEXT 0 None 0
8 8 has_company INTEGER 0 None 0
9 9 has_gtin INTEGER 0 None 0
10 10 average_stock TEXT 0 None 0
11 11 business_type TEXT 0 None 0
12 12 declared_product_catalog_size REAL 0 None 0
13 13 declared_monthly_revenue REAL 0 None 0
In [26]:
query = "SELECT * FROM leads_closed LIMIT 10;"
pd.read_sql(query, conn)
Out[26]:
mql_id seller_id sdr_id sr_id won_date business_segment lead_type lead_behaviour_profile has_company has_gtin average_stock business_type declared_product_catalog_size declared_monthly_revenue
0 5420aad7fec3549a85876ba1c529bd84 2c43fb513632d29b3b58df74816f1b06 a8387c01a09e99ce014107505b92388c 4ef15afb4b2723d8f3d81e51ec7afefe 2018-02-26 19:58:54 pet online_medium cat None None None reseller None 0.0
1 a555fb36b9368110ede0f043dfc3b9a0 bbb7d7893a450660432ea6652310ebb7 09285259593c61296eef10c734121d5b d3d1e91a157ea7f90548eef82f1955e3 2018-05-08 20:17:59 car_accessories industry eagle None None None reseller None 0.0
2 327174d3648a2d047e8940d7d15204ca 612170e34b97004b3ba37eae81836b4c b90f87164b5f8c2cfa5c8572834dbe3f 6565aa9ce3178a5caf6171827af3a9ba 2018-06-05 17:27:23 home_appliances online_big cat None None None reseller None 0.0
3 f5fee8f7da74f4887f5bcae2bafb6dd6 21e1781e36faf92725dde4730a88ca0f 56bf83c4bb35763a51c2baab501b4c67 d3d1e91a157ea7f90548eef82f1955e3 2018-01-17 13:51:03 food_drink online_small None None None None reseller None 0.0
4 ffe640179b554e295c167a2f6be528e0 ed8cb7b190ceb6067227478e48cf8dde 4b339f9567d060bcea4f5136b9f5949e d3d1e91a157ea7f90548eef82f1955e3 2018-07-03 20:17:45 home_appliances industry wolf None None None manufacturer None 0.0
5 b94fba7670eeb44dce2a0d8eb790e9f5 1c742ac33582852aaf3bcfbf5893abcf fdb16d3cbbeb5798f2f66c4096be026d 495d4e95a8cf8bbf8b432b612a2aa328 2018-02-07 18:04:05 health_beauty online_medium None None None None manufacturer None 0.0
6 c3e30ed7ac989117c7e1e719b4ac128f 92d7568ad0c5c76fd7d341b2d46f24d6 4b339f9567d060bcea4f5136b9f5949e 85fc447d336637ba1df43e793199fbc8 2018-04-16 18:18:22 computers online_medium wolf None None None reseller None 0.0
7 b02c89251106e1fdd9d92744be9f94f2 44ed138eca6214d572ce1d813fb0049b 34d40cdaf94010a1d05b0d6212f9e909 4ef15afb4b2723d8f3d81e51ec7afefe 2018-04-17 17:01:57 health_beauty offline cat None None None manufacturer None 0.0
8 a90a37898cc5f2718385a2fb981caaff 0b28859cd04d23edefee9c591fb03cd8 f42a2bd194f7802ab052a815c8de65b7 6565aa9ce3178a5caf6171827af3a9ba 2018-05-14 18:37:15 household_utilities offline eagle None None None reseller None 0.0
9 0173e8d8b1d94a355b440fb67388f532 87d73636a3acf123e842bb890a4db036 9d12ef1a7eca3ec58c545c678af7869c 9e4d1098a3b0f5da39b0bc48f9876645 2018-04-24 03:00:00 household_utilities online_medium eagle None None None manufacturer None 0.0

Products with highest revenues¶

  • The query below helps to identify the top 10 products generating the highest revenue.
  • As a result we understand that product with product_id = 'bb50f2e236e5eea0100680137654686c' generated the highest revenue (63,885.00) from 195 sales.

This product is likely one of the highest-priced or most frequently purchased items.

In [28]:
query = """

SELECT product_id, SUM(price), COUNT(product_id) FROM order_items
GROUP BY product_id
ORDER BY SUM(price) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[28]:
[('bb50f2e236e5eea0100680137654686c', 63885.0, 195),
 ('6cdd53843498f92890544667809f1595', 54730.200000000106, 156),
 ('d6160fb7873f184099d9bc95e30376af', 48899.34, 35),
 ('d1c427060a0f73f6b889a5c7c61f2ac4', 47214.50999999998, 343),
 ('99a4788cb24856965c36a24e339b6058', 43025.56000000037, 488),
 ('3dd2a17168ec895c781a9191c1e95ad7', 41082.60000000021, 274),
 ('25c38557cf793876c5abdd5931f922db', 38907.32000000001, 38),
 ('5f504b3a1c75b73d6151be81eb05bdc9', 37733.90000000001, 63),
 ('53b36df67ebb7c41585e8d54d6772e08', 37683.42000000013, 323),
 ('aca2eb7d00ea1a7b8ebd4e68314663af', 37608.900000000314, 527)]

10 most frequently ordered products, regardless of revenue.¶

  • The query below highlights the top 10 most frequently ordered products, regardless of revenue.
  • The product with product_id = 'aca2eb7d00ea1a7b8ebd4e68314663af' has the highest order count (527), generating 37,608.00 in revenue.

This product might be a low-cost or essential item that sells in large volumes.

In [31]:
query = """

SELECT product_id, SUM(price), COUNT(product_id) FROM order_items
GROUP BY product_id
ORDER BY COUNT(product_id) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[31]:
[('aca2eb7d00ea1a7b8ebd4e68314663af', 37608.900000000314, 527),
 ('99a4788cb24856965c36a24e339b6058', 43025.56000000037, 488),
 ('422879e10f46682990de24d770e7f83d', 26577.22000000016, 484),
 ('389d119b48cf3043d311335e499d9c6b', 21440.59000000004, 392),
 ('368c6c730842d78016ad823897a372db', 21056.80000000004, 388),
 ('53759a2ecddad2bb87a079a1f1519f73', 20387.200000000033, 373),
 ('d1c427060a0f73f6b889a5c7c61f2ac4', 47214.50999999998, 343),
 ('53b36df67ebb7c41585e8d54d6772e08', 37683.42000000013, 323),
 ('154e7e31ebfa092203795c972e5804a6', 6325.18999999996, 281),
 ('3dd2a17168ec895c781a9191c1e95ad7', 41082.60000000021, 274)]
  • We can conclude that Products with high revenue don't necessarily have the highest order count, and vice versa.
  • For example:

Product 'bb50f2e236e5eea0100680137654686c' tops revenue but is only ordered 195 times. Product 'aca2eb7d00ea1a7b8ebd4e68314663af' has the most orders but ranks lower in revenue. This indicates that expensive products contribute more to revenue even with fewer sales, while cheaper products dominate order counts.

Recommendations: For high-revenue products focus marketing efforts on premium buyers and ensure stock availability. For high-order-count products explore upselling or bundling strategies to increase revenue.


In [ ]:
 

Product Categories with highest Revenue¶

  • The query below calculates the total revenue total orders for each product category.Categories are ranked in descending order of revenue, returning the top 10.
  • We undesrtand that the top category is 'beleza_saude' (health and beauty), generating a total revenue of 1,258,681.34 from 9,670 orders. This suggests that health and beauty products are highly profitable.

'relogios_presentes' (watches and gifts) ranks second with 1,205,005.68 in revenue from 5,991 orders, indicating a smaller but higher-priced product set. 'cama_mesa_banho' (bed, bath, and table) has the most orders (11,115) among the top 3 categories, but it generates slightly less revenue (1,036,988.68) than the other two. This implies these products might be lower in price but sell in higher quantities.

In [35]:
query = """

SELECT products.product_category_name, SUM(order_items.price), COUNT(*) 
FROM order_items 
JOIN products ON products.product_id = order_items.product_id
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[35]:
[('beleza_saude', 1258681.3399999682, 9670),
 ('relogios_presentes', 1205005.6799999995, 5991),
 ('cama_mesa_banho', 1036988.6800000712, 11115),
 ('esporte_lazer', 988048.9700000401, 8641),
 ('informatica_acessorios', 911954.3200000388, 7827),
 ('moveis_decoracao', 729762.4900000411, 8334),
 ('cool_stuff', 635290.8500000009, 3796),
 ('utilidades_domesticas', 632248.6600000213, 6964),
 ('automotivo', 592720.1100000107, 4235),
 ('ferramentas_jardim', 485256.46000001475, 4347)]

Product Categories with highest Revenue in English¶

  • The query below a similar query to query above but includes a JOIN with the product_category_name_translation table to get English translations of category names.
  • The addition of English names makes the results more client-friendly for non-Portuguese speakers.
In [38]:
query = """

SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*) 
FROM order_items 
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[38]:
[('health_beauty', 1258681.34, 9670),
 ('watches_gifts', 1205005.68, 5991),
 ('bed_bath_table', 1036988.68, 11115),
 ('sports_leisure', 988048.97, 8641),
 ('computers_accessories', 911954.32, 7827),
 ('furniture_decor', 729762.49, 8334),
 ('cool_stuff', 635290.85, 3796),
 ('housewares', 632248.66, 6964),
 ('auto', 592720.11, 4235),
 ('garden_tools', 485256.46, 4347)]
In [39]:
# Plotly Sunburst Visualization

revenue_data = pd.DataFrame({
    'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table', 
                         'sports_leisure', 'computers_accessories', 'furniture_decor', 
                         'cool_stuff', 'housewares', 'auto', 'garden_tools'],
    'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32, 
                729762.49, 635290.85, 632248.66, 592720.11, 485256.46],
    'order_count': [9670, 5991, 11115, 8641, 7827, 8334, 3796, 6964, 4235, 4347]
})

hover_data={'revenue': True, 'order_count': True}


fig = px.sunburst(revenue_data, 
                  path=['product_category'], 
                  values='revenue', 
                  color='order_count', 
                  color_continuous_scale='Viridis', 
                  title='Top 10 Product Categories: Revenue & Orders',
                  hover_data={'revenue': True, 'order_count': True})
fig.update_traces(textinfo='label+percent parent')  # Show percentage relative to the parent category
fig.update_layout(title_font_size=20)
fig.show()

Explanation is below with donut chart's explanation.

In [41]:
#Donut Chart
fig = px.pie(revenue_data, 
             names='product_category', 
             values='revenue', 
             color='product_category',  
             title='Top 10 Product Categories: Revenue Distribution')

fig.update_traces(hole=0.4, textinfo='percent+label')  

fig.update_layout(title_font_size=20, showlegend=True)

fig.show()

The donut chart illustrates the distribution of revenue among the top 10 product categories in the dataset. Each segment represents a product category, with its size corresponding to the total revenue generated by that category. The percentage labels inside the chart indicate each category's share of the total revenue.

From the chart, we observe the following key insights:

Health & Beauty accounts for the largest share of revenue, contributing approximately 14.9% of the total revenue among the top 10 categories. This is followed by Watches & Gifts and Bed, Bath & Table, which contribute 14.2% and 12.2%, respectively. Other categories such as Sports & Leisure and Computers & Accessories also make significant contributions to overall revenue. This visualization highlights which product categories are the most lucrative, providing valuable insights for potential business decisions, such as inventory management, marketing strategies, or promotional activities.

The pie chart also shows the same things, but it's visually harder to understand than donut chart, because here texts are more visible and overall visualisation is easier to understand.

In [43]:
# Plotly Packed Bubble Chart
fig = px.scatter(revenue_data, 
                 x='product_category', 
                 y='revenue', 
                 size='revenue', 
                 color='product_category', 
                 hover_name='product_category', 
                 size_max=60, 
                 title='Top 10 Product Categories by Revenue (Bubble Chart)')
fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(title_font_size=20, xaxis_title=None, yaxis_title="Revenue")

fig.update_layout(
    height=600)

fig.show()

In case of buuble chart, we can compare the revenues of each category based on the size of the bubble, the bigger the bubble, the bigger the revene. And as the visualizations are for the same dataset ( I was experimenting to find out which is more beautiful/understandable), we can see the same results as of which category has more revenue.

In [45]:
# Animated bubble chart

import numpy as np

revenue_data = pd.DataFrame({
    'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table', 'sports_leisure', 'computers_accessories', 
                         'furniture_decor', 'cool_stuff', 'housewares', 'auto', 'garden_tools'],
    'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32, 729762.49, 635290.85, 632248.66, 592720.11, 485256.46]
})


years = ['2020', '2021', '2022', '2023']
revenue_data_animated = pd.concat([revenue_data.assign(year=year) for year in years], ignore_index=True)


np.random.seed(42)
revenue_data_animated['revenue'] *= (1 + 0.1 * np.random.randn(len(revenue_data_animated)))

fig = px.scatter(
    revenue_data_animated, 
    x='product_category', 
    y='revenue', 
    size='revenue', 
    color='product_category', 
    animation_frame='year',  
    hover_name='product_category', 
    size_max=60, 
    title='Top 10 Product Categories by Revenue Over Time (Bubble Chart)'
)

fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(
    title_font_size=20, 
    xaxis_title=None, 
    yaxis_title="Revenue", 
    xaxis={'categoryorder': 'total ascending'}
)

fig.update_layout(
    height=600)

fig.show()

This one is the animated version of the bubble chart below. Here the animation frame based on year is added.

In [ ]:
 
In [47]:
# Plotly Funnel Chart
fig = px.funnel(revenue_data, 
                x='revenue', 
                y='product_category', 
                title='Top 10 Product Categories by Revenue (Funnel Chart)', 
                color='product_category')
fig.update_layout(title_font_size=20, xaxis_title="Revenue", yaxis_title=None)

fig.update_layout(
    height=600)

fig.show()

The funnel chart above is a good visualization with the help of which the viewer can easily see the difference of revenues based on product categories. The higher the revenue of the product pategory is, the bigger is its bar and in lower posistion. In my opinion, it is the best visualization in term of easy understanding. It's just simple looking compared of some of the others I had above.

In [49]:
# The radial chart

revenue_data = pd.DataFrame({
    'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table', 'sports_leisure', 'computers_accessories', 
                         'furniture_decor', 'cool_stuff', 'housewares', 'auto', 'garden_tools'],
    'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32, 729762.49, 635290.85, 632248.66, 592720.11, 485256.46]
})

revenue_data_closed = revenue_data.copy()
revenue_data_closed = pd.concat([revenue_data, revenue_data.iloc[0:1]], ignore_index=True)


category_order = list(revenue_data['product_category']) + [revenue_data['product_category'][0]]

fig = px.line_polar(
    revenue_data_closed,
    r='revenue',
    theta='product_category',
    line_close=False, 
    title='Revenue by Product Categories (Radial Chart)',
    markers=True,
    color_discrete_sequence=px.colors.sequential.Sunset
)


fig.add_scatterpolar(
    r=[revenue_data_closed['revenue'].iloc[0]],
    theta=[revenue_data_closed['product_category'].iloc[0]],
    mode='lines+markers',
    line=dict(color=px.colors.sequential.Sunset[0])
)


fig.update_traces(marker=dict(size=8))
fig.update_layout(title_font_size=20, polar=dict(angularaxis=dict(categoryorder='array', categoryarray=category_order)))
fig.show()

The radial chart is also visual representation of revenues based on productcategories, the further is the category from center the bigger is the ammount of its revenue.

In [ ]:
 

Product Categories camparison by number of orders in English¶

  • This query calculates the same metrics as query above but ranks categories by the number of orders, not revenue.

Key Insights:

  • bed_bath_table ranks first in order count (11,115 orders), despite being third in revenue. This indicates that it’s a high-demand, low-cost category.
  • health_beauty drops to second in order count, with 9,670 orders, but still leads in revenue due to higher prices.
  • sports_leisure ranks third with 8,641 orders, making it a popular category.
In [52]:
query = """

SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*) 
FROM order_items 
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY COUNT(*) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[52]:
[('bed_bath_table', 1036988.68, 11115),
 ('health_beauty', 1258681.34, 9670),
 ('sports_leisure', 988048.97, 8641),
 ('furniture_decor', 729762.49, 8334),
 ('computers_accessories', 911954.32, 7827),
 ('housewares', 632248.66, 6964),
 ('watches_gifts', 1205005.68, 5991),
 ('telephony', 323667.53, 4545),
 ('garden_tools', 485256.46, 4347),
 ('auto', 592720.11, 4235)]
In [53]:
# Horizontal Bar Chart - Matplotlib

categories = ['bed_bath_table', 'health_beauty', 'sports_leisure', 'furniture_decor',
              'computers_accessories', 'housewares', 'watches_gifts', 'telephony',
              'garden_tools', 'auto']
orders = [11115, 9670, 8641, 8334, 7827, 6964, 5991, 4545, 4347, 4235]

plt.figure(figsize=(10, 6))
plt.barh(categories, orders, color='skyblue')
plt.xlabel('Number of Orders')
plt.ylabel('Product Categories')
plt.title('Product Categories by Number of Orders')
plt.gca().invert_yaxis()  
plt.show()
No description has been provided for this image

The bar chaet below helps us to see product categories based on the number of orders. As we can see the category "bed_bath_table" has the highest number of orders, while "auto" has the lowest.

In [ ]:
 
In [55]:
import pandas as pd
import plotly.express as px

# Assuming you have already executed the SQL query and fetched the results
results = [('bed_bath_table', 1036988.68, 11115),
           ('health_beauty', 1258681.34, 9670),
           ('sports_leisure', 988048.97, 8641),
           ('furniture_decor', 729762.49, 8334),
           ('computers_accessories', 911954.32, 7827),
           ('housewares', 632248.66, 6964),
           ('watches_gifts', 1205005.68, 5991),
           ('telephony', 323667.53, 4545),
           ('garden_tools', 485256.46, 4347),
           ('auto', 592720.11, 4235)]

# Create the DataFrame from the results
df = pd.DataFrame(results, columns=['Product Category', 'Revenue', 'Orders'])

# Create the bubble chart
fig = px.scatter(df, 
                 x='Orders', 
                 y='Revenue', 
                 size='Orders', 
                 color='Product Category',
                 title='Bubble Chart: Product Categories by Orders and Revenue',
                 labels={'Orders': 'Number of Orders', 'Revenue': 'Total Revenue'})
fig.update_layout(
    height=600)

fig.show()

The bubble chart below again shows product categories by orders and revenue. When looking at horizontal axis we can see the numbe rof orders, and by looking at vertical axis we can see the number of total revenue. And beside visually seeing each category's corresponding bubble in the visual in corresponding place based on numbers, we can click on each of bubbles to see exact numbers of that categorts revenue and orders.

Low-performing categories¶

In [58]:
query = """

SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*) 
FROM order_items 
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) ASC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[58]:
[('security_and_services', 283.29, 2),
 ('fashion_childrens_clothes', 569.85, 8),
 ('cds_dvds_musicals', 730.0, 14),
 ('home_comfort_2', 760.27, 30),
 ('flowers', 1110.04, 33),
 ('diapers_and_hygiene', 1567.59, 39),
 ('arts_and_craftmanship', 1814.01, 24),
 ('la_cuisine', 2054.99, 14),
 ('fashion_sport', 2119.51, 30),
 ('fashio_female_clothing', 2803.64, 48)]
  • Recommentations:

For high performing categories focus marketing and inventory management efforts on health_beauty, watches_gifts, and bed_bath_table as they are the top revenue drivers. For bed_bath_table, explore opportunities to increase per-order value through bundles or upselling since it has high volume but relatively low revenue.

For low-performing categories reassess the viability of maintaining categories like security_and_services and cds_dvds_musicals, which contribute minimal revenue and may not justify storage or marketing costs. Explore strategies like discounts or clearance sales to liquidate inventory in these categories.

Insights for Category Expansion: Consider expanding high-margin categories like health_beauty and watches_gifts by introducing new products or targeting similar customer segments.


In [61]:
#Bar chart

low_performing_categories = [
    ('telephony', 323667.53, 4545),
    ('garden_tools', 485256.46, 4347),
    ('auto', 592720.11, 4235),
    ('watches_gifts', 1205005.68, 5991),
    ('housewares', 632248.66, 6964),
    ('furniture_decor', 729762.49, 8334),
    ('computers_accessories', 911954.32, 7827),
    ('sports_leisure', 988048.97, 8641),
    ('health_beauty', 1258681.34, 9670),
    ('bed_bath_table', 1036988.68, 11115)
]


df_low_performing = pd.DataFrame(low_performing_categories, columns=['Product Category', 'Revenue', 'Orders'])

fig = px.bar(df_low_performing, 
             x='Product Category', 
             y='Revenue', 
             title='Low Performing Product Categories by Revenue',
             labels={'Revenue': 'Total Revenue', 'Product Category': 'Product Category'},
             color='Product Category')

fig.show()

Here again we have bar chart where we can visually understand the revenue of categores. This time the categories are 10 least performing ones, and the comparison is between them.

Sellers¶

  • There are 3,095 sellers in the dataset.
In [64]:
query = "SELECT COUNT(*) FROM sellers;"
pd.read_sql(query, conn)
Out[64]:
COUNT(*)
0 3095

Top 10 Sellers by Number of Orders¶

  • The query below ranks sellers by the number of orders handled. It also provides the total revenue generated by each seller.

  • Key Insights:

The seller '6560211a19b47992c3666cc44a7e94c0' handled the highest number of orders (2,033) and generated 123,304.83 in revenue.

The second seller '4a3ca9315b744ce9f8e9374361493884' had slightly fewer orders (1,987) but earned significantly higher revenue (200,472.92), suggesting they sell higher-priced products.

The sellers listed handle high order volumes, with counts ranging from 1,171 to 2,033, making them major contributors to the platform's activity.

In [66]:
query = """

SELECT seller_id, SUM(price), COUNT(seller_id) FROM order_items
GROUP BY seller_id
ORDER BY COUNT(seller_id) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[66]:
[('6560211a19b47992c3666cc44a7e94c0', 123304.83000000003, 2033),
 ('4a3ca9315b744ce9f8e9374361493884', 200472.9199999949, 1987),
 ('1f50f920176fa81dab994f9023523100', 106939.20999999739, 1931),
 ('cc419e0650a3c5ba77189a1882b7556a', 104288.42000000263, 1775),
 ('da8622b14eb17ae2831f4ac5b9dab84a', 160236.56999999538, 1551),
 ('955fee9216a65b617aa5c0531780ce60', 135171.69999999914, 1499),
 ('1025f0e2d44d7041d6cf58b6550e0bfa', 138968.5499999995, 1428),
 ('7c67e1448b00f6e969d365cea6b010ab', 187923.8899999995, 1364),
 ('ea8482cd71df3c1969d7b9473ff13abc', 37177.52000000046, 1203),
 ('7a67c85e85bb2ce8582c35f2203ad736', 141745.53000000177, 1171)]

Top 10 Sellers by Total Revenue¶

  • The query below ranks sellers by their total revenue instead of order count. It includes the number of orders to give additional context.

  • Key Insights:

The top seller by revenue is '4869f7a5dfa277a7dca6462dcf3b52b2', generating 229,472.63 from 1,156 orders. Despite fewer orders compared to the highest order-volume seller, this seller generates significantly more revenue, indicating high-value sales.

The second seller '53243585a1d6dc2643021fd1853d8905' earned 222,776.05 from just 410 orders, showing an even higher average order value than the first.

Sellers with high revenue don't necessarily overlap with those handling the most orders, reinforcing that product pricing and value per order differ widely between sellers.

In [68]:
query = """

SELECT seller_id, SUM(price), COUNT(seller_id) FROM order_items
GROUP BY seller_id
ORDER BY SUM(price) DESC

LIMIT 10 """

cur.execute(query)
cur.fetchall()
Out[68]:
[('4869f7a5dfa277a7dca6462dcf3b52b2', 229472.6299999981, 1156),
 ('53243585a1d6dc2643021fd1853d8905', 222776.04999999952, 410),
 ('4a3ca9315b744ce9f8e9374361493884', 200472.9199999949, 1987),
 ('fa1c13f2614d7b5c4749cbc52fecda94', 194042.02999999846, 586),
 ('7c67e1448b00f6e969d365cea6b010ab', 187923.8899999995, 1364),
 ('7e93a43ef30c4f03f38b393420bc753a', 176431.86999999982, 340),
 ('da8622b14eb17ae2831f4ac5b9dab84a', 160236.56999999538, 1551),
 ('7a67c85e85bb2ce8582c35f2203ad736', 141745.53000000177, 1171),
 ('1025f0e2d44d7041d6cf58b6550e0bfa', 138968.5499999995, 1428),
 ('955fee9216a65b617aa5c0531780ce60', 135171.69999999914, 1499)]

Order Status¶

In [71]:
query = "SELECT * FROM orders LIMIT 5;"
pd.read_sql(query, conn)
Out[71]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
In [72]:
query = """
SELECT order_status FROM orders 
GROUP BY order_status
ORDER BY order_status
"""
cur.execute(query)
cur.fetchall()
Out[72]:
[('approved',),
 ('canceled',),
 ('created',),
 ('delivered',),
 ('invoiced',),
 ('processing',),
 ('shipped',),
 ('unavailable',)]

Top Sellers by Revenue based on "delivered" items.¶

  • Seller 4a3ca9315b744ce9f8e9374361493884 handled the most orders (1,949), but Seller 4869f7a5dfa277a7dca6462dcf3b52b2 generated the highest revenue ($226,987.93).
  • There’s a large variation in the number of orders per seller, suggesting differences in business models or product pricing.
In [74]:
query = """

SELECT seller_id, SUM(price), COUNT(*) FROM order_items
JOIN orders ON orders.order_id = order_items.order_id
WHERE order_status = 'delivered'
GROUP BY seller_id 
ORDER BY SUM(price) DESC

LIMIT 5
"""

cur.execute(query)
cur.fetchall()
Out[74]:
[('4869f7a5dfa277a7dca6462dcf3b52b2', 226987.92999999813, 1148),
 ('53243585a1d6dc2643021fd1853d8905', 217940.43999999968, 400),
 ('4a3ca9315b744ce9f8e9374361493884', 196882.11999999516, 1949),
 ('fa1c13f2614d7b5c4749cbc52fecda94', 190917.13999999838, 579),
 ('7c67e1448b00f6e969d365cea6b010ab', 186570.04999999996, 1355)]

Order Status Analysis by Number¶

  • From the results below we see that the majority of orders (96,478) are delivered, indicating successful transactions.

  • A significant number of orders are canceled (625) or marked as unavailable (609), which might represent issues in fulfillment or customer cancellations.

  • Low number of "created" status indicates rapid process of orders and delivery.

In [77]:
query = """
SELECT order_status, COUNT(*) FROM orders 
GROUP BY order_status
ORDER BY order_status
"""
cur.execute(query)
cur.fetchall()
Out[77]:
[('approved', 2),
 ('canceled', 625),
 ('created', 5),
 ('delivered', 96478),
 ('invoiced', 314),
 ('processing', 301),
 ('shipped', 1107),
 ('unavailable', 609)]

Top Products by revenue for Delivered items¶

  • Product bb50f2e236e5eea0100680137654686c generates the highest revenue but is not the most frequently sold (194 orders).
  • From the 10 highest revenue generating delivered items, product 99a4788cb24856965c36a24e339b6058 has the highest sales volume (477 orders) but ranks fifth in revenue, suggesting lower pricing.
In [79]:
query = """

SELECT product_id, SUM(price), COUNT(*) FROM order_items
JOIN orders ON orders.order_id = order_items.order_id
WHERE order_status = 'delivered'
GROUP BY product_id
ORDER BY SUM(price) DESC

LIMIT 10

"""

cur.execute(query)
cur.fetchall()
Out[79]:
[('bb50f2e236e5eea0100680137654686c', 63560.0, 194),
 ('6cdd53843498f92890544667809f1595', 53652.3000000001, 153),
 ('d6160fb7873f184099d9bc95e30376af', 45949.35, 33),
 ('d1c427060a0f73f6b889a5c7c61f2ac4', 45620.55999999998, 332),
 ('99a4788cb24856965c36a24e339b6058', 42049.66000000035, 477),
 ('3dd2a17168ec895c781a9191c1e95ad7', 40782.80000000021, 272),
 ('25c38557cf793876c5abdd5931f922db', 38907.32000000001, 38),
 ('5f504b3a1c75b73d6151be81eb05bdc9', 37733.90000000001, 63),
 ('53b36df67ebb7c41585e8d54d6772e08', 37454.630000000136, 321),
 ('aca2eb7d00ea1a7b8ebd4e68314663af', 37104.300000000294, 520)]
In [ ]:
 
In [80]:
query = "SELECT * FROM order_reviews LIMIT 5;"
pd.read_sql(query, conn)
Out[80]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 None None 2018-01-18 00:00:00 2018-01-18 21:46:59
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 None None 2018-03-10 00:00:00 2018-03-11 03:05:13
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 None None 2018-02-17 00:00:00 2018-02-18 14:36:24
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 None Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 None Parabéns lojas lannister adorei comprar pela I... 2018-03-01 00:00:00 2018-03-02 10:26:53

Review Score Analysis¶

From the query below we conclude that the majority of orders received a high review score of 5 (57328), accounting for the largest percent of reviews (57.78%).

In [82]:
query = """

SELECT order_reviews.review_score, COUNT(*),
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM order_reviews), 2) AS percent 
FROM orders
JOIN order_reviews on order_reviews.order_id = orders.order_id
GROUP BY review_score
ORDER BY review_score


"""

cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
Out[82]:
review_score COUNT(*) percent
0 1 11424 11.51
1 2 3151 3.18
2 3 8179 8.24
3 4 19142 19.29
4 5 57328 57.78
In [83]:
# Interactive pie chart

data = {
    'review_score': [1, 2, 3, 4, 5],
    'count': [11424, 3151, 8179, 19142, 57328],
    'percent': [11.51, 3.18, 8.24, 19.29, 57.78]
}

df = pd.DataFrame(data)

fig = px.pie(df, 
             names='review_score', 
             values='count',
             hover_data={'percent': True},
             title="Review Score Distribution")

fig.update_traces(textinfo='percent+label', pull=[0.1, 0, 0, 0, 0])  


fig.show()

With the help of this pie chart we can see the percentage of teach review score visually.

In [85]:
# A line plot for percentage changes over time

fig = px.line(df, 
              x='review_score', 
              y='percent', 
              markers=True,
              title="Review Score Percentages",
              labels={'review_score': 'Review Score', 'percent': 'Percentage of Reviews'})

fig.show()

Here we can also see the percantage change in case of revies scores, but I think pie chart is visually better as we have only 5 options in this case.

In [87]:
# Interactive bar chart

data = {
    'review_score': [1, 2, 3, 4, 5],
    'count': [11424, 3151, 8179, 19142, 57328],
    'percent': [11.51, 3.18, 8.24, 19.29, 57.78]
}

df = pd.DataFrame(data)

fig = px.bar(df, 
             x='review_score', 
             y='count', 
             color='review_score',
             hover_data={'percent': True},
             labels={'review_score': 'Review Score', 'count': 'Number of Reviews'},
             title="Review Score Distribution",
             text='percent')

fig.update_traces(texttemplate='%{text}%', textposition='outside')

fig.update_layout(
    height=600)

fig.show()

Here I have bar chart again showing the revoew score percentages, the bigger the bar the bigger the percentage. And the percantages are also wriiten above each bar.

In [ ]:
 
In [ ]:
 

Sellers City Order Analysis (All Orders)¶

In [90]:
query = """

SELECT sellers.seller_city, 
SUM(price) AS Sales, 
COUNT(*) AS Count
FROM order_items	
JOIN sellers ON sellers.seller_id = order_items.seller_id
GROUP BY sellers.seller_city 
ORDER BY SUM(price) DESC


"""

cur.execute(query)
cur.fetchall()

pd.read_sql(query, conn)
Out[90]:
seller_city Sales Count
0 sao paulo 2702878.14 27983
1 ibitinga 624592.94 7750
2 curitiba 470759.82 3016
3 rio de janeiro 358413.59 2442
4 guarulhos 329494.38 2362
... ... ... ...
606 resende 19.99 1
607 floranopolis 19.98 2
608 jarinu 14.63 1
609 macatuba 13.00 1
610 palotina 9.99 1

611 rows × 3 columns

Seller City Order Analysis (Delivered Orders Only)¶

In [92]:
query = """

SELECT sellers.seller_city, 
SUM(price) AS Sales, 
COUNT(*) AS Count
FROM order_items	
JOIN sellers ON sellers.seller_id = order_items.seller_id
JOIN orders ON orders.order_id = order_items.order_id
WHERE orders.order_status = 'delivered'
GROUP BY sellers.seller_city 
ORDER BY SUM(price) DESC

LIMIT 10

"""

cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
Out[92]:
seller_city Sales Count
0 sao paulo 2628634.52 27357
1 ibitinga 614836.09 7621
2 curitiba 458812.03 2955
3 rio de janeiro 342341.64 2356
4 guarulhos 318217.59 2309
5 ribeirao preto 267478.85 2208
6 itaquaquecetuba 228460.34 1639
7 guariba 226987.93 1148
8 santo andre 223280.71 2886
9 lauro de freitas 220689.44 401
  • Comparison: All Orders vs. Delivered Orders

Sao Paulo remains the top-performing city in both total sales and transaction count, contributing the most significant share.

Sales figures for "delivered orders" are slightly lower than those for "all orders," indicating some incomplete or canceled orders, particularly in larger markets.

Ibatinga, Curitiba, and Rio de Janeiro consistently follow Sao Paulo, showcasing high transactional volume and revenue.

  • Insights and Recommendations:

Top Markets (Sao Paulo, Ibatinga, Curitiba): These cities contribute significantly to revenue and sales volume. Strategies such as targeted marketing campaigns, loyalty programs, or promotional offers in these regions could amplify growth further.

Smaller Markets (Guariba, Lauro de Freitas): Cities like Guariba and Lauro de Freitas have high average order values despite lower transaction counts. Investigating seller or product types in these areas could reveal niche opportunities to expand.

Delivered Orders vs. All Orders: A small proportion of sales occurs from non-delivered orders. Investigate order cancellations or pending orders in top-performing cities (e.g., Sao Paulo or Ibatinga) to address potential inefficiencies.

In [95]:
# Matplotlib Visualization

query_city = """
SELECT sellers.seller_city, 
SUM(price) AS Sales, 
COUNT(*) AS Count
FROM order_items	
JOIN sellers ON sellers.seller_id = order_items.seller_id
JOIN orders ON orders.order_id = order_items.order_id
WHERE orders.order_status = 'delivered'
GROUP BY sellers.seller_city 
ORDER BY SUM(price) DESC
LIMIT 10
"""
city_data = pd.read_sql(query_city, conn)


plt.figure(figsize=(10, 6))
plt.barh(city_data['seller_city'], city_data['Sales'], color='darkorange')
plt.xlabel('Total Sales (in BRL)', fontsize=12)
plt.ylabel('Seller City', fontsize=12)
plt.title('Top 10 Seller Cities by Sales (Delivered Orders)', fontsize=16)
plt.gca().invert_yaxis()  
plt.tight_layout()
plt.show()
No description has been provided for this image

With the help of this chart we can easily see top seller cities by sales. The bigger the bar the bigger the revenue.

In [97]:
# Seaborn Visualization
plt.figure(figsize=(10, 6))
sns.barplot(x='Count', y='seller_city', data=city_data, palette='coolwarm')
plt.xlabel('Number of Delivered Orders', fontsize=12)
plt.ylabel('Seller City', fontsize=12)
plt.title('Top 10 Seller Cities by Number of Delivered Orders', fontsize=16)
plt.tight_layout()
plt.show()
No description has been provided for this image

Here we can see the same results, I used seaborn here, wereas in earlieer chart I used matplotlip visualization.

In [99]:
# Plotly Visualization
fig = px.bar(city_data, 
             x='Sales', 
             y='seller_city', 
             orientation='h', 
             color='Sales',
             color_continuous_scale='Viridis',
             title='Top 10 Seller Cities by Sales (Delivered Orders)',
             labels={'Sales': 'Total Sales (in BRL)', 'seller_city': 'Seller City'})
fig.update_layout(title_font_size=20, 
                  xaxis_title='Total Sales (in BRL)', 
                  yaxis_title='Seller City', 
                  yaxis=dict(categoryorder='total ascending'))
fig.show()

Here again we can see the the visualization showing the top 10 seller cities by revenue again, and this time I used Plotly .

In [ ]:
 

Number of total orders, delivered orders and the percent of delivered orders¶

In [102]:
query = """

SELECT 
Count(order_id) AS 'Total orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'delivered') AS 'Delivered Orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'delivered')*100 / Count(order_id) AS 'Delivered %'
FROM orders

"""
cur.execute(query)
cur.fetchall()

pd.read_sql(query, conn)
Out[102]:
Total orders Delivered Orders Delivered %
0 99441 96478 97

Number and percentage of canceled orders¶

In [104]:
query = """

SELECT 
Count(order_id) AS 'Total orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'canceled') AS 'Canceled Orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'canceled')*100 / Count(order_id) AS 'Canceled %'
FROM orders

"""
cur.execute(query)
cur.fetchall()
Out[104]:
[(99441, 625, 0)]

Order Status Analysis¶

Total orders: 99,441 (This is the total number of orders in the system.)

Delivered orders: 96,478 Delivered percentage: 97% This means that a significant portion (97%) of the orders have been successfully delivered, indicating an efficient order fulfillment process.

Canceled orders: 625 Canceled percentage: 0% The number of canceled orders is minimal, only 0.63% of the total orders, which is a good indicator of a relatively low cancellation rate.

  • Summary Insights:

High delivery success rate: 97% of orders are delivered, suggesting that the system is functioning well in terms of processing and shipping orders. Low cancellation rate: With only 625 canceled orders, the cancellation rate is very low (0%), which can be considered a positive sign in terms of operational efficiency and customer satisfaction.

In [ ]:
 

Analysis of Qualified Leads by Origin¶

In [108]:
query = """

SELECT origin, 
COUNT(*),
ROUND(CAST(COUNT(*)*100.0/ (SELECT COUNT(*) FROM leads_qualified) AS FLOAT),3) AS Percentage
FROM leads_qualified

GROUP BY origin

ORDER BY COUNT(*) DESC

"""
cur.execute(query)
cur.fetchall()

pd.read_sql(query, conn)
Out[108]:
origin COUNT(*) Percentage
0 organic_search 2296 28.700
1 paid_search 1586 19.825
2 social 1350 16.875
3 unknown 1099 13.738
4 direct_traffic 499 6.238
5 email 493 6.163
6 referral 284 3.550
7 other 150 1.875
8 display 118 1.475
9 other_publicities 65 0.813
10 None 60 0.750
In [109]:
# The donut chart

query = """
SELECT origin, 
COUNT(*),
ROUND(CAST(COUNT(*)*100.0/ (SELECT COUNT(*) FROM leads_qualified) AS FLOAT),3) AS Percentage
FROM leads_qualified
GROUP BY origin
ORDER BY COUNT(*) DESC
"""

origin_data = pd.read_sql(query, conn)

fig = px.pie(origin_data, values='Percentage', names='origin', 
             title='Qualified Leads Percentage by Origin', 
             hole=0.4)  
fig.update_traces(textinfo='percent+label')
fig.update_layout(title_font_size=20)
fig.show()

With the help of donut chart we can see the qualified leads percentage by origin, each one has its defined color, and the percentages are written in a way that visually are easy to see and understand immediately.

Analysis of Closed Leads by Business Segment¶

In [112]:
query = """

SELECT COUNT(*), business_segment,
ROUND(COUNT(*) *100.0 / (SELECT COUNT(*) FROM leads_closed),2)  AS 'Percentage'
FROM leads_closed
GROUP BY business_segment
ORDER BY  COUNT(*) DESC

LIMIT 10

"""
cur.execute(query)
cur.fetchall()

pd.read_sql(query, conn)
Out[112]:
COUNT(*) business_segment Percentage
0 105 home_decor 12.47
1 93 health_beauty 11.05
2 77 car_accessories 9.14
3 71 household_utilities 8.43
4 69 construction_tools_house_garden 8.19
5 64 audio_video_electronics 7.60
6 34 computers 4.04
7 30 pet 3.56
8 28 food_supplement 3.33
9 26 food_drink 3.09

Analysis for two above queries¶

leads qualified by origin¶

1.Top Origins by Count:

Organic Search: 2,296 leads (28.7% of the total leads) Paid Search: 1,586 leads (19.83%) Social: 1,350 leads (16.88%) Unknown: 1,099 leads (13.74%) These origins are significant in driving qualified leads. Organic search, paid search, and social media contribute almost 65% of the total leads.

2.Other Origins:

Direct Traffic: 499 leads (6.24%) Email: 493 leads (6.16%) Referral: 284 leads (3.55%) Display: 118 leads (1.48%) Other Publicities: 65 leads (0.81%) Unknown: 60 leads (0.75%) These categories represent a smaller portion of the leads but are still valuable, especially for understanding how less conventional sources contribute to lead generation.

Leads Closed by Business Segment:¶

1.Top Business Segments by Count:

Home Decor: 105 closed leads (12.47%) Health & Beauty: 93 closed leads (11.05%) Car Accessories: 77 closed leads (9.14%) Household Utilities: 71 closed leads (8.43%) Construction Tools & House/Garden: 69 closed leads (8.19%) These business segments make up a substantial portion of closed leads. Home Decor and Health & Beauty together account for 23.5% of the closed leads.

  1. Other Business Segments:

Audio/Video Electronics: 64 leads (7.60%) Computers: 34 leads (4.04%) Pet: 30 leads (3.56%) Food Supplement: 28 leads (3.33%) Food & Drink: 26 leads (3.09%) Sports & Leisure: 25 leads (2.97%) Fashion Accessories: 20 leads (2.38%) Several other segments contribute a smaller but still meaningful portion to the business, with notable entries like Food & Drink, Pet, and Sports & Leisure.

  • Insights & Recommendations:

Marketing Focus: The top three origins (Organic Search, Paid Search, and Social) are the primary drivers of qualified leads. Continuing to optimize for these channels, with a particular focus on improving organic search rankings and paid search campaigns, can yield even more qualified leads.

Efforts to further investigate and enhance the "Unknown" category could uncover valuable opportunities for better lead classification and targeting.

Sales Segmentation: Home Decor, Health & Beauty, and Car Accessories are the leading business segments in terms of closed leads. These segments should be prioritized for further marketing and sales efforts, as they demonstrate strong conversion potential.

Other segments like Audio/Video Electronics, Computers, and Pet could be explored for different campaigns that might improve conversions or attract more leads.

In [114]:
query_segment = """
    SELECT COUNT(*) AS lead_count, business_segment,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM leads_closed), 2) AS percentage
    FROM leads_closed
    GROUP BY business_segment
    ORDER BY lead_count DESC
    LIMIT 10;
"""
segment_data = pd.read_sql(query_segment, conn)
In [115]:
#barplot
plt.figure(figsize=(10, 6))
plt.barh(segment_data['business_segment'], segment_data['lead_count'], color='teal')
plt.xlabel('Number of Closed Leads', fontsize=12)
plt.ylabel('Business Segment', fontsize=12)
plt.title('Closed Leads by Business Segment', fontsize=16)
plt.gca().invert_yaxis()  
plt.tight_layout()
plt.show()
No description has been provided for this image
In [116]:
fig = px.pie(segment_data, values='percentage', names='business_segment', 
             title='Closed Leads Percentage by Business Segment')
fig.update_traces(textinfo='percent+label')
fig.update_layout(title_font_size=20)
fig.show()

Here in case of both the bar plot and pie plot we can visually see the closed leads percentage bu business segment, the bigger the size of bar/pie part, the bigger the percentage. So home_decor has highest percentage, and food_drink has lowest among 10.

In [ ]:
 
In [ ]: